In [1]:
import streamlit as st
import datetime as dt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
import clickhouse_driver
from clickhouse_driver.client import Client
from pandas_profiling import ProfileReport

import warnings
warnings.simplefilter(action = 'ignore', category = FutureWarning)

clickhouse = Client('85.193.83.20', database = 'hn_launches',
                    user = '{user}', password = '{password}')
2022-06-22 18:17:01.223 INFO    visions.backends: Pandas backend loaded 1.4.0
2022-06-22 18:17:01.278 INFO    visions.backends: Numpy backend loaded 1.22.1
2022-06-22 18:17:01.280 INFO    visions.backends: Pyspark backend NOT loaded
2022-06-22 18:17:01.281 INFO    visions.backends: Python backend loaded

Querying data from Clickhouse¶

In [2]:
#Quering data from Clickhouse
launches_metrics = clickhouse.query_dataframe("""
select l.item_id as item_id, max(date(l.time)) as date, max(name) as name, 
max(yc_batch) as yc_batch, 
toInt8(max(replaceRegexpAll(l.yc_batch, 'S|W| Nonprofit', ''))) as yc_batch_y,
max(short_description) as short_description, 
max(industry) as industry, max(is_oss) as is_oss,
max(length(text)) as text_len,
max(score) as score,
uniq(comment_id) as comments_qty,
uniqIf(comment_id, sentiment == 'pos') as comments_pos_qty,
uniqIf(comment_id, sentiment == 'neg') as comments_neg_qty,
uniqIf(comment_id, sentiment == 'neu') as comments_neu_qty,
max(employees) as employees, max(estimated_revenue) as revenue, max(total_funding) as total_funding, max(github_stars) as github_stars
from hn_launches.launches l
left join hn_launches.comments c on c.item_id = l.item_id
left join hn_launches.company_growth g on g.item_id = l.item_id
group by item_id;
""")

launches_metrics['date'] = pd.to_datetime(launches_metrics['date'])
In [3]:
launches_metrics.info()
launches_metrics.head()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 291 entries, 0 to 290
Data columns (total 18 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   item_id            291 non-null    int64         
 1   date               291 non-null    datetime64[ns]
 2   name               291 non-null    object        
 3   yc_batch           291 non-null    object        
 4   yc_batch_y         291 non-null    int64         
 5   short_description  291 non-null    object        
 6   industry           291 non-null    object        
 7   is_oss             291 non-null    int64         
 8   text_len           291 non-null    int64         
 9   score              291 non-null    int64         
 10  comments_qty       291 non-null    int64         
 11  comments_pos_qty   291 non-null    int64         
 12  comments_neg_qty   291 non-null    int64         
 13  comments_neu_qty   291 non-null    int64         
 14  employees          291 non-null    int64         
 15  revenue            291 non-null    int64         
 16  total_funding      291 non-null    float64       
 17  github_stars       291 non-null    int64         
dtypes: datetime64[ns](1), float64(1), int64(12), object(4)
memory usage: 41.0+ KB
Out[3]:
item_id date name yc_batch yc_batch_y short_description industry is_oss text_len score comments_qty comments_pos_qty comments_neg_qty comments_neu_qty employees revenue total_funding github_stars
0 26468204 2021-03-15 Mezli W21 21 Robotic restaurants that serve healthy fast food Food 0 5637 201 127 69 9 49 15 2430000 0.0 0
1 24158509 2020-08-14 Tella S20 20 Collaborative video editing in the browser SaaS 0 4601 221 81 54 2 25 0 0 0.0 0
2 28877003 2021-10-15 Inflow S21 21 Self-help app for people with adhd Consumer 0 2930 183 249 116 24 109 0 0 0.0 0
3 25991485 2021-02-01 Opstrace S19 19 Open-source datadog DevTools 1 4994 316 114 78 4 32 0 0 0.0 1188
4 23846186 2020-07-15 Legacy S19 19 We help men test and freeze their sperm Consumer 0 3672 193 92 48 5 39 88 12320000 20.2 0
In [4]:
launches_metrics.describe()
Out[4]:
item_id yc_batch_y is_oss text_len score comments_qty comments_pos_qty comments_neg_qty comments_neu_qty employees revenue total_funding github_stars
count 2.910000e+02 291.000000 291.000000 291.000000 291.000000 291.000000 291.000000 291.000000 291.000000 291.000000 2.910000e+02 2.910000e+02 291.000000
mean 2.743650e+07 20.621993 0.099656 3894.986254 140.762887 75.615120 43.615120 4.632302 27.367698 29.292096 6.378193e+06 5.154689e+05 128.189003
std 2.150811e+06 1.300423 0.300057 1367.983439 87.503589 53.233671 27.680996 5.711952 23.012115 135.469709 4.294584e+07 8.793155e+06 1002.542634
min 2.377021e+07 8.000000 0.000000 75.000000 27.000000 6.000000 4.000000 0.000000 1.000000 0.000000 0.000000e+00 0.000000e+00 0.000000
25% 2.607926e+07 20.000000 0.000000 3109.000000 79.000000 37.000000 24.000000 1.000000 11.000000 0.000000 0.000000e+00 0.000000e+00 0.000000
50% 2.742374e+07 21.000000 0.000000 3869.000000 117.000000 61.000000 38.000000 3.000000 20.000000 0.000000 0.000000e+00 0.000000e+00 0.000000
75% 2.914154e+07 21.000000 0.000000 4693.500000 183.500000 92.500000 54.500000 6.000000 35.000000 15.500000 1.475250e+06 0.000000e+00 0.000000
max 3.172484e+07 22.000000 1.000000 8363.000000 617.000000 251.000000 159.000000 32.000000 109.000000 1696.000000 6.529600e+08 1.500000e+08 12753.000000

Data Profiling¶

In [5]:
profile = ProfileReport(launches_metrics, title = 'HN Launches Profiling Report')

profile.to_notebook_iframe()
Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]
Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]
Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Exploratory data analysis¶

In [19]:
metric = 'score'
px.box(launches_metrics, y = metric, color = 'industry', 
       title = f'Boxplot of {metric}')
In [16]:
metric = 'score'
#industries = launches_metrics['industry'].unique().tolist()
#me = [launches_metrics[launches_metrics['industry'] == i][metric].values for i in industries]

fig = ff.create_distplot([launches_metrics[metric].values], [metric], show_hist = False)
#fig = ff.create_distplot(me, industries, show_hist = False)
fig.update_layout(width = 700, height = 600, title = f'Distplot of {metric}')

fig.show()
In [6]:
px.bar(launches_metrics[(launches_metrics['yc_batch_y'].isin([19, 20, 21])) & (~launches_metrics['yc_batch'].str.contains('Nonprofit'))] \
           .groupby(['yc_batch', 'yc_batch_y', 'industry'])['item_id'] \
           .nunique().reset_index().sort_values('yc_batch_y', ascending = False).rename(columns = {'item_id': 'launches'}), 
       x = 'launches', y = 'yc_batch', color = 'industry', title = 'YC Batches Launches')
In [7]:
px.line(launches_metrics[launches_metrics['yc_batch_y'].isin([19, 20, 21])].groupby(pd.Grouper(key = 'date', freq = 'M', axis = 0))['item_id'] \
               .nunique().reset_index().rename(columns = {'item_id': 'launches'}), 
        x = 'date', y = 'launches', title = 'YC Launches')
In [8]:
px.bar(launches_metrics.groupby(['industry'])['score'].mean().reset_index() \
           .sort_values('score', ascending = False),
       x = 'score', y = 'industry', color = 'industry', title = 'Top Scored Industries')
In [9]:
px.bar(launches_metrics.groupby(['industry'])['comments_qty'].mean().reset_index() \
           .sort_values('comments_qty', ascending = False),
       x = 'comments_qty', y = 'industry', color = 'industry', title = 'Top Commented Industries')
In [17]:
px.bar(launches_metrics.groupby(['industry'])['comments_neg_qty'].mean().reset_index() \
           .sort_values('comments_neg_qty', ascending = False),
       x = 'comments_neg_qty', y = 'industry', color = 'industry', title = '(anti)Top Negative Commented Industries')
In [10]:
metric = 'score'
px.bar(launches_metrics.groupby(['name'])[metric].max().reset_index().sort_values(metric, ascending = False)[:100],
       y = metric, x = 'name', title = f'Top Launches by {metric}')
In [36]:
px.bar(launches_metrics.groupby(['name'])[['comments_qty', 'comments_neu_qty', 'comments_pos_qty', 'comments_neg_qty']].max().reset_index() \
           .sort_values('comments_qty', ascending = False)[:100],
       y = ['comments_neu_qty', 'comments_pos_qty', 'comments_neg_qty'], x = 'name', 
       color_discrete_map = {'comments_neu_qty': '#636EFA', 'comments_pos_qty': '#00CC96', 'comments_neg_qty': '#EF553B'},
       title = f'Top Commented Lauhches')
In [13]:
metric_x = 'score'; metric_y = 'comments_pos_qty'; metric_size = 'text_len'
px.scatter(launches_metrics, x = metric_x, y = metric_y, color = 'industry', size = metric_size, 
           title = f'{metric_x} vs {metric_y}, size: {metric_size}')
In [14]:
corr = launches_metrics[['text_len', 'score',
                         'comments_qty', 'comments_pos_qty', 'comments_neg_qty', 
                         'employees', 'revenue', 'total_funding', 'github_stars']].corr().round(2)

go.Figure(go.Heatmap(z = corr.values, x = corr.columns, y = corr.columns, 
                     text = corr.values, texttemplate = '%{text}',
                     colorscale = px.colors.diverging.RdBu,
                     zmin = -1, zmax = 1)).show()
In [ ]: